#’ Tasks

#’ 1. Carry an appropriate association rules mining, using the Apriori algorithm. Choose relevant minimum support and confidence values for this dataset. #’ 2. Select and comment a few rules that you may find more relevant, considering the main rules’ metrics: support, confidence, and lift.

#’ 3. Compile a commented report in PDF or HTML directly from RSudio.

#’ Note:

#’ When using a ‘wide’ dataset to create a transaction object, all #’ variables must either be logical or factor. Therefore, it is #’ recommended to: #’ #’ 1. Discretize all numerical variables (eg. age). Discretization #’ can be carried with the function discretize() from package arules. #’ 2. Factorize all categorical or text variables. Factorization can be #’ carried with function factor from R base. #’ 3. Set argument format = "wide" (default) when generating a #’ transactions object. Argument cols is not needed for this format.

#Clening cache

rm(list = ls())
graphics.off()

#Load Library

library(arules)
## Carregando pacotes exigidos: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
library(readxl)

Load dataset

adult_income <- read_xlsx("C:/Users/fabia/OneDrive/Área de Trabalho/machine learning R/adult_income.xlsx")

#look your dados

dplyr::glimpse(adult_income)
## Rows: 48,842
## Columns: 10
## $ age            <dbl> 25, 38, 28, 44, 18, 34, 29, 63, 24, 55, 65, 36, 26, 58,…
## $ workclass      <chr> "Private", "Private", "Local-gov", "Private", NA, "Priv…
## $ education      <chr> "11th", "HS-grad", "Assoc-acdm", "Some-college", "Some-…
## $ marital.status <chr> "Never-married", "Married-civ-spouse", "Married-civ-spo…
## $ occupation     <chr> "Machine-op-inspct", "Farming-fishing", "Protective-ser…
## $ relationship   <chr> "Own-child", "Husband", "Husband", "Husband", "Own-chil…
## $ race           <chr> "Black", "White", "White", "Black", "White", "White", "…
## $ gender         <chr> "Male", "Male", "Male", "Male", "Female", "Male", "Male…
## $ native.country <chr> "United-States", "United-States", "United-States", "Uni…
## $ income         <chr> "<=50K", "<=50K", ">50K", ">50K", "<=50K", "<=50K", "<=…

#’ Note that this dataset is presented in a “long” (ie. horizontal) format

head(adult_income, 15)
## # A tibble: 15 × 10
##      age workclass education marital.status occupation relationship race  gender
##    <dbl> <chr>     <chr>     <chr>          <chr>      <chr>        <chr> <chr> 
##  1    25 Private   11th      Never-married  Machine-o… Own-child    Black Male  
##  2    38 Private   HS-grad   Married-civ-s… Farming-f… Husband      White Male  
##  3    28 Local-gov Assoc-ac… Married-civ-s… Protectiv… Husband      White Male  
##  4    44 Private   Some-col… Married-civ-s… Machine-o… Husband      Black Male  
##  5    18 <NA>      Some-col… Never-married  <NA>       Own-child    White Female
##  6    34 Private   10th      Never-married  Other-ser… Not-in-fami… White Male  
##  7    29 <NA>      HS-grad   Never-married  <NA>       Unmarried    Black Male  
##  8    63 Self-emp… Prof-sch… Married-civ-s… Prof-spec… Husband      White Male  
##  9    24 Private   Some-col… Never-married  Other-ser… Unmarried    White Female
## 10    55 Private   7th-8th   Married-civ-s… Craft-rep… Husband      White Male  
## 11    65 Private   HS-grad   Married-civ-s… Machine-o… Husband      White Male  
## 12    36 Federal-… Bachelors Married-civ-s… Adm-cleri… Husband      White Male  
## 13    26 Private   HS-grad   Never-married  Adm-cleri… Not-in-fami… White Female
## 14    58 <NA>      HS-grad   Married-civ-s… <NA>       Husband      White Male  
## 15    48 Private   HS-grad   Married-civ-s… Machine-o… Husband      White Male  
## # ℹ 2 more variables: native.country <chr>, income <chr>

discretize()

adult_income$age_discrete <- discretize(adult_income$age)

#dados factor

adult_income3 <- lapply(adult_income, function(x) {
  if(is.character(x) || is.factor(x)) {
    return(factor(x))
  } else {
    return(x)
  }
})

#’ In this case, our dataset is presented in a “wide” format. #’ Therefore we must set argument ‘format’ to “wide”, #’ and name which columns contain ‘TransactionID’ and ‘Item’.

#' In this case, our dataset is presented in a "wide" format.
#' Therefore we must set argument 'format' to "wide",
#' and name which columns contain 'TransactionID' and 'Item'.
transactions <- arules::transactions(adult_income,
                                     format = "wide")
## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 not logical or factor.
## Applying default discretization (see '? discretizeDF').
transactions
## transactions in sparse format with
##  48842 transactions (rows) and
##  107 items (columns)
summary(transactions)
## transactions as itemMatrix in sparse format with
##  48842 rows (elements/itemsets/transactions) and
##  107 columns (items) and a density of 0.1015667 
## 
## most frequent items:
## native.country=United-States                   race=White 
##                        43832                        41762 
##                 income=<=50K            workclass=Private 
##                        37155                        33906 
##                  gender=Male                      (Other) 
##                        32650                       341492 
## 
## element (itemset/transaction) length distribution:
## sizes
##     8     9    10    11 
##    46  2753   821 45222 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.00   11.00   11.00   10.87   11.00   11.00 
## 
## includes extended item information - examples:
##        labels variables  levels
## 1 age=[17,31)       age [17,31)
## 2 age=[31,44)       age [31,44)
## 3 age=[44,90]       age [44,90]
## 
## includes extended transaction information - examples:
##   transactionID
## 1             1
## 2             2
## 3             3
#' Plot most frequent items in a barplot
itemFrequencyPlot(transactions,
                  topN = 15,
                  type = 'absolute',
                  horiz = T,
                  cex.names = 0.8)

rules <- apriori(transactions,
                 parameter = list(supp = 0.01,   # minimum Support
                                  conf = 0.60,   # minimum Confidence
                                  minlen = 2))   # minimum length of items (lhs+rhs)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 488 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[107 item(s), 48842 transaction(s)] done [0.02s].
## sorting and recoding items ... [59 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(transactions, parameter = list(supp = 0.01, conf = 0.6, :
## Mining stopped (maxlen reached). Only patterns up to a length of 10 returned!
##  done [0.10s].
## writing ... [84887 rule(s)] done [0.01s].
## creating S4 object  ... done [0.12s].
summary(rules)
## set of 84887 rules
## 
## rule length distribution (lhs + rhs):sizes
##     2     3     4     5     6     7     8     9    10 
##   278  3042 11738 22461 24206 15623  6089  1327   123 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   5.000   6.000   5.718   7.000  10.000 
## 
## summary of quality measures:
##     support          confidence        coverage            lift        
##  Min.   :0.01001   Min.   :0.6000   Min.   :0.01001   Min.   : 0.7138  
##  1st Qu.:0.01267   1st Qu.:0.8229   1st Qu.:0.01448   1st Qu.: 1.0435  
##  Median :0.01732   Median :0.9204   Median :0.02011   Median : 1.2851  
##  Mean   :0.02814   Mean   :0.8881   Mean   :0.03237   Mean   : 1.7108  
##  3rd Qu.:0.02905   3rd Qu.:0.9968   3rd Qu.:0.03307   3rd Qu.: 2.4070  
##  Max.   :0.78811   Max.   :1.0000   Max.   :0.89742   Max.   :20.6605  
##      count      
##  Min.   :  489  
##  1st Qu.:  619  
##  Median :  846  
##  Mean   : 1375  
##  3rd Qu.: 1419  
##  Max.   :38493  
## 
## mining info:
##          data ntransactions support confidence
##  transactions         48842    0.01        0.6
##                                                                                 call
##  apriori(data = transactions, parameter = list(supp = 0.01, conf = 0.6, minlen = 2))
#' Rules can be seen with 'inspect()' function
#' eg. show the first N rules
inspect(rules[1:5])
##     lhs                                       rhs                    support confidence   coverage      lift count
## [1] {education=Doctorate}                  => {race=White}        0.01076942  0.8855219 0.01216166 1.0356463   526
## [2] {marital.status=Married-spouse-absent} => {income=<=50K}      0.01167028  0.9076433 0.01285779 1.1931399   570
## [3] {education=12th}                       => {workclass=Private} 0.01048278  0.7792998 0.01345154 1.1225908   512
## [4] {education=12th}                       => {income=<=50K}      0.01246878  0.9269406 0.01345154 1.2185072   609
## [5] {education=12th}                       => {race=White}        0.01054420  0.7838661 0.01345154 0.9167565   515
#' Sort rules by descending metric of interest,
#' and inspect the top n few.
rules <- sort(rules, by = 'lift', decreasing = TRUE)
# options(digits = 4)  # to reduce the number of digits in the output
inspect(rules[1:15])
##      lhs                                     rhs                    support confidence   coverage     lift count
## [1]  {marital.status=Married-civ-spouse,                                                                        
##       gender=Female,                                                                                            
##       native.country=United-States,                                                                             
##       income=>50K,                                                                                              
##       age_discrete=[31,44)}               => {relationship=Wife} 0.01011425  0.9860279 0.01025757 20.66048   494
## [2]  {age=[31,44),                                                                                              
##       marital.status=Married-civ-spouse,                                                                        
##       gender=Female,                                                                                            
##       native.country=United-States,                                                                             
##       income=>50K}                        => {relationship=Wife} 0.01011425  0.9860279 0.01025757 20.66048   494
## [3]  {age=[31,44),                                                                                              
##       marital.status=Married-civ-spouse,                                                                        
##       gender=Female,                                                                                            
##       native.country=United-States,                                                                             
##       income=>50K,                                                                                              
##       age_discrete=[31,44)}               => {relationship=Wife} 0.01011425  0.9860279 0.01025757 20.66048   494
## [4]  {marital.status=Married-civ-spouse,                                                                        
##       gender=Female,                                                                                            
##       income=>50K,                                                                                              
##       age_discrete=[31,44)}               => {relationship=Wife} 0.01124033  0.9821109 0.01144507 20.57840   549
## [5]  {age=[31,44),                                                                                              
##       marital.status=Married-civ-spouse,                                                                        
##       gender=Female,                                                                                            
##       income=>50K}                        => {relationship=Wife} 0.01124033  0.9821109 0.01144507 20.57840   549
## [6]  {age=[31,44),                                                                                              
##       marital.status=Married-civ-spouse,                                                                        
##       gender=Female,                                                                                            
##       income=>50K,                                                                                              
##       age_discrete=[31,44)}               => {relationship=Wife} 0.01124033  0.9821109 0.01144507 20.57840   549
## [7]  {workclass=Private,                                                                                        
##       marital.status=Married-civ-spouse,                                                                        
##       race=White,                                                                                               
##       gender=Female,                                                                                            
##       native.country=United-States,                                                                             
##       income=>50K}                        => {relationship=Wife} 0.01103558  0.9782214 0.01128127 20.49691   539
## [8]  {marital.status=Married-civ-spouse,                                                                        
##       race=White,                                                                                               
##       gender=Female,                                                                                            
##       native.country=United-States,                                                                             
##       income=>50K}                        => {relationship=Wife} 0.01809918  0.9778761 0.01850866 20.48967   884
## [9]  {marital.status=Married-civ-spouse,                                                                        
##       gender=Female,                                                                                            
##       native.country=United-States,                                                                             
##       income=>50K}                        => {relationship=Wife} 0.01998280  0.9769770 0.02045371 20.47083   976
## [10] {workclass=Private,                                                                                        
##       marital.status=Married-civ-spouse,                                                                        
##       gender=Female,                                                                                            
##       native.country=United-States,                                                                             
##       income=>50K}                        => {relationship=Wife} 0.01199787  0.9766667 0.01228451 20.46433   586
## [11] {workclass=Private,                                                                                        
##       marital.status=Married-civ-spouse,                                                                        
##       race=White,                                                                                               
##       gender=Female,                                                                                            
##       income=>50K}                        => {relationship=Wife} 0.01189550  0.9764706 0.01218214 20.46022   581
## [12] {marital.status=Married-civ-spouse,                                                                        
##       race=White,                                                                                               
##       gender=Female,                                                                                            
##       income=>50K}                        => {relationship=Wife} 0.01943000  0.9763374 0.01990090 20.45743   949
## [13] {workclass=Private,                                                                                        
##       marital.status=Married-civ-spouse,                                                                        
##       race=White,                                                                                               
##       gender=Female,                                                                                            
##       native.country=United-States,                                                                             
##       age_discrete=[31,44)}               => {relationship=Wife} 0.01044183  0.9751434 0.01070800 20.43241   510
## [14] {age=[31,44),                                                                                              
##       workclass=Private,                                                                                        
##       marital.status=Married-civ-spouse,                                                                        
##       race=White,                                                                                               
##       gender=Female,                                                                                            
##       native.country=United-States}       => {relationship=Wife} 0.01044183  0.9751434 0.01070800 20.43241   510
## [15] {age=[31,44),                                                                                              
##       workclass=Private,                                                                                        
##       marital.status=Married-civ-spouse,                                                                        
##       race=White,                                                                                               
##       gender=Female,                                                                                            
##       native.country=United-States,                                                                             
##       age_discrete=[31,44)}               => {relationship=Wife} 0.01044183  0.9751434 0.01070800 20.43241   510
#' Explore by searching specific items, either on
#'   the Antecedent (lhs: left-hand side) or 
#'   the Consequent (rhs: right-hand side)
#' 
#' Useful matching operators:
#'   %in%  : select itemsets matching any given item
#'   %ain% : select itemsets matching ALL given items
#'   %oin% : select itemsets matching ONLY the given item
#'   %pin% : equivalent to %in% with PARTIAL matching

rules_filtered <- subset(rules, subset = rhs %pin% "heart")
inspect(rules_filtered)

Inspect interactively (Viewer)

arulesViz::inspectDT(rules)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

Explore rules (Shiny)

arulesViz::ruleExplorer(rules[1:10])
## ruleExplorer started.
## PhantomJS not found. You can install it with webshot::install_phantomjs(). If it is installed, please make sure the phantomjs executable can be found via the PATH variable.
Shiny applications not supported in static R Markdown documents

Conclusion: After analyzing using the filter, it was very interesting to check out some associations. One that caught my attention was people with

income=>50K have eduaction=Bachelors

rules_filtered <- subset(rules, subset = rhs %pin% "income=>50K")
inspect(rules_filtered[1:6])
##     lhs                                     rhs              support confidence   coverage     lift count
## [1] {education=Prof-school,                                                                              
##      marital.status=Married-civ-spouse}  => {income=>50K} 0.01031899  0.8456376 0.01220261 3.534066   504
## [2] {workclass=Private,                                                                                  
##      education=Bachelors,                                                                                
##      occupation=Exec-managerial,                                                                         
##      relationship=Husband,                                                                               
##      native.country=United-States}       => {income=>50K} 0.01138365  0.8212703 0.01386102 3.432231   556
## [3] {workclass=Private,                                                                                  
##      education=Bachelors,                                                                                
##      marital.status=Married-civ-spouse,                                                                  
##      occupation=Exec-managerial,                                                                         
##      relationship=Husband,                                                                               
##      native.country=United-States}       => {income=>50K} 0.01138365  0.8212703 0.01386102 3.432231   556
## [4] {workclass=Private,                                                                                  
##      education=Bachelors,                                                                                
##      occupation=Exec-managerial,                                                                         
##      relationship=Husband,                                                                               
##      gender=Male,                                                                                        
##      native.country=United-States}       => {income=>50K} 0.01138365  0.8212703 0.01386102 3.432231   556
## [5] {workclass=Private,                                                                                  
##      education=Bachelors,                                                                                
##      marital.status=Married-civ-spouse,                                                                  
##      occupation=Exec-managerial,                                                                         
##      relationship=Husband,                                                                               
##      gender=Male,                                                                                        
##      native.country=United-States}       => {income=>50K} 0.01138365  0.8212703 0.01386102 3.432231   556
## [6] {workclass=Private,                                                                                  
##      education=Bachelors,                                                                                
##      occupation=Exec-managerial,                                                                         
##      relationship=Husband,                                                                               
##      race=White,                                                                                         
##      native.country=United-States}       => {income=>50K} 0.01095369  0.8192956 0.01336964 3.423978   535